knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
options(repos = c(CRAN = "https://cran.stat.auckland.ac.nz/"))
required_pkgs <- c(
  "dplyr", "tidyr", "readr", "tibble", "janitor", "skimr",
  "recipes", "rsample", "tidymodels", "rpart", "kernlab",
  "FactoMineR", "factoextra", "caret" , "purrr", "plotly",
  "RColorBrewer","glue"
)
to_install <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(required_pkgs, library, character.only = TRUE))
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ purrr        1.0.4
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ ggplot2      3.5.2     ✔ workflows    1.2.0
## ✔ infer        1.0.8     ✔ workflowsets 1.1.0
## ✔ modeldata    1.4.0     ✔ yardstick    1.3.2
## ✔ parsnip      1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:dials':
## 
##     buffer
## The following object is masked from 'package:scales':
## 
##     alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Data Load and Inspection

candidate <- c("../data/unzipped", "./data/unzipped", "data/unzipped")
data_dir  <- candidate[file.exists(candidate)][1]
if (is.na(data_dir)) stop("Cannot find data folder.")
path_annotated <- file.path(data_dir,"seeds_annotated.csv")
path_unlabeled <- file.path(data_dir,"seeds.csv")
annotated <- read_csv(path_annotated, locale=locale(decimal_mark=","), show_col_types=FALSE)
unlabeled <- read_csv(path_unlabeled, locale=locale(decimal_mark=","), show_col_types=FALSE)
has_annotated_index <- "...1" %in% names(annotated)  
has_unlabeled_index <- "...1" %in% names(unlabeled) 
if (has_annotated_index) annotated <- select(annotated,-...1)
if (has_unlabeled_index) unlabeled <- select(unlabeled,-...1)

glimpse(unlabeled)
## Rows: 13,511
## Columns: 16
## $ Area            <dbl> 28395, 28734, 29380, 30008, 30140, 30279, 30477, 30519…
## $ Perimeter       <dbl> 610.291, 638.018, 624.110, 645.884, 620.134, 634.927, …
## $ MajorAxisLength <dbl> 208.1781, 200.5248, 212.8261, 210.5580, 201.8479, 212.…
## $ MinorAxisLength <dbl> 173.8887, 182.7344, 175.9311, 182.5165, 190.2793, 181.…
## $ AspectRation    <dbl> 1.197191, 1.097356, 1.209713, 1.153638, 1.060798, 1.17…
## $ Eccentricity    <dbl> 0.5498122, 0.4117853, 0.5627273, 0.4986160, 0.3336797,…
## $ ConvexArea      <dbl> 28715, 29172, 29690, 30724, 30417, 30600, 30970, 30847…
## $ EquivDiameter   <dbl> 190.1411, 191.2728, 193.4109, 195.4671, 195.8965, 196.…
## $ Extent          <dbl> 0.7639225, 0.7839681, 0.7781132, 0.7826813, 0.7730980,…
## $ Solidity        <dbl> 0.9888560, 0.9849856, 0.9895588, 0.9766957, 0.9908932,…
## $ roundness       <dbl> 0.9580271, 0.8870336, 0.9478495, 0.9039364, 0.9848771,…
## $ Compactness     <dbl> 0.9133578, 0.9538608, 0.9087742, 0.9283288, 0.9705155,…
## $ ShapeFactor1    <dbl> 0.007331506, 0.006978659, 0.007243912, 0.007016729, 0.…
## $ ShapeFactor2    <dbl> 0.003147289, 0.003563624, 0.003047733, 0.003214562, 0.…
## $ ShapeFactor3    <dbl> 0.8342224, 0.9098505, 0.8258706, 0.8617944, 0.9419004,…
## $ ShapeFactor4    <dbl> 0.9987239, 0.9984303, 0.9990661, 0.9941988, 0.9991661,…
glimpse(annotated)
## Rows: 100
## Columns: 17
## $ Area            <dbl> 32708, 31590, 59344, 174621, 63096, 34550, 43880, 3307…
## $ Perimeter       <dbl> 657.630, 649.120, 944.279, 1591.312, 1010.607, 742.826…
## $ MajorAxisLength <dbl> 233.1297, 235.0561, 355.4248, 590.5549, 402.1450, 269.…
## $ MinorAxisLength <dbl> 178.9160, 171.3433, 214.7311, 379.9938, 202.8656, 164.…
## $ AspectRation    <dbl> 1.303012, 1.371843, 1.655208, 1.554117, 1.982322, 1.64…
## $ Eccentricity    <dbl> 0.6411054, 0.6845706, 0.7968680, 0.7654864, 0.8634357,…
## $ ConvexArea      <dbl> 33047, 31908, 60734, 177034, 64321, 35311, 44179, 3356…
## $ EquivDiameter   <dbl> 204.0714, 200.5533, 274.8802, 471.5234, 283.4366, 209.…
## $ Extent          <dbl> 0.7560795, 0.7523399, 0.7148932, 0.7663656, 0.7080528,…
## $ Solidity        <dbl> 0.9897419, 0.9900338, 0.9771133, 0.9863698, 0.9809549,…
## $ roundness       <dbl> 0.9503873, 0.9421271, 0.8363461, 0.8665541, 0.7763313,…
## $ Compactness     <dbl> 0.8753555, 0.8532147, 0.7733850, 0.7984414, 0.7048119,…
## $ ShapeFactor1    <dbl> 0.007127605, 0.007440839, 0.005989229, 0.003381924, 0.…
## $ ShapeFactor2    <dbl> 0.002581435, 0.002432400, 0.001321702, 0.000847844, 0.…
## $ ShapeFactor3    <dbl> 0.7662472, 0.7279753, 0.5981244, 0.6375087, 0.4967599,…
## $ ShapeFactor4    <dbl> 0.9984291, 0.9986676, 0.9900206, 0.9907632, 0.9847381,…
## $ Class           <chr> "D", "D", "C", "B", "E", "D", "F", "D", "F", "G", "D",…
names(annotated) <- trimws(names(annotated))
annotated <- mutate(annotated, Class=as.factor(Class))
annotated |> glimpse()
## Rows: 100
## Columns: 17
## $ Area            <dbl> 32708, 31590, 59344, 174621, 63096, 34550, 43880, 3307…
## $ Perimeter       <dbl> 657.630, 649.120, 944.279, 1591.312, 1010.607, 742.826…
## $ MajorAxisLength <dbl> 233.1297, 235.0561, 355.4248, 590.5549, 402.1450, 269.…
## $ MinorAxisLength <dbl> 178.9160, 171.3433, 214.7311, 379.9938, 202.8656, 164.…
## $ AspectRation    <dbl> 1.303012, 1.371843, 1.655208, 1.554117, 1.982322, 1.64…
## $ Eccentricity    <dbl> 0.6411054, 0.6845706, 0.7968680, 0.7654864, 0.8634357,…
## $ ConvexArea      <dbl> 33047, 31908, 60734, 177034, 64321, 35311, 44179, 3356…
## $ EquivDiameter   <dbl> 204.0714, 200.5533, 274.8802, 471.5234, 283.4366, 209.…
## $ Extent          <dbl> 0.7560795, 0.7523399, 0.7148932, 0.7663656, 0.7080528,…
## $ Solidity        <dbl> 0.9897419, 0.9900338, 0.9771133, 0.9863698, 0.9809549,…
## $ roundness       <dbl> 0.9503873, 0.9421271, 0.8363461, 0.8665541, 0.7763313,…
## $ Compactness     <dbl> 0.8753555, 0.8532147, 0.7733850, 0.7984414, 0.7048119,…
## $ ShapeFactor1    <dbl> 0.007127605, 0.007440839, 0.005989229, 0.003381924, 0.…
## $ ShapeFactor2    <dbl> 0.002581435, 0.002432400, 0.001321702, 0.000847844, 0.…
## $ ShapeFactor3    <dbl> 0.7662472, 0.7279753, 0.5981244, 0.6375087, 0.4967599,…
## $ ShapeFactor4    <dbl> 0.9984291, 0.9986676, 0.9900206, 0.9907632, 0.9847381,…
## $ Class           <fct> D, D, C, B, E, D, F, D, F, G, D, F, G, E, C, G, F, G, …
head(annotated)
## # A tibble: 6 × 17
##     Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
##    <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
## 1  32708      658.            233.            179.         1.30        0.641
## 2  31590      649.            235.            171.         1.37        0.685
## 3  59344      944.            355.            215.         1.66        0.797
## 4 174621     1591.            591.            380.         1.55        0.765
## 5  63096     1011.            402.            203.         1.98        0.863
## 6  34550      743.            270.            164.         1.64        0.793
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## #   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## #   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <fct>
cat("NAs in annotated:", sum(is.na(annotated)), "\n")
## NAs in annotated: 0
names(unlabeled) <- trimws(names(unlabeled))
unlabeled |> glimpse()
## Rows: 13,511
## Columns: 16
## $ Area            <dbl> 28395, 28734, 29380, 30008, 30140, 30279, 30477, 30519…
## $ Perimeter       <dbl> 610.291, 638.018, 624.110, 645.884, 620.134, 634.927, …
## $ MajorAxisLength <dbl> 208.1781, 200.5248, 212.8261, 210.5580, 201.8479, 212.…
## $ MinorAxisLength <dbl> 173.8887, 182.7344, 175.9311, 182.5165, 190.2793, 181.…
## $ AspectRation    <dbl> 1.197191, 1.097356, 1.209713, 1.153638, 1.060798, 1.17…
## $ Eccentricity    <dbl> 0.5498122, 0.4117853, 0.5627273, 0.4986160, 0.3336797,…
## $ ConvexArea      <dbl> 28715, 29172, 29690, 30724, 30417, 30600, 30970, 30847…
## $ EquivDiameter   <dbl> 190.1411, 191.2728, 193.4109, 195.4671, 195.8965, 196.…
## $ Extent          <dbl> 0.7639225, 0.7839681, 0.7781132, 0.7826813, 0.7730980,…
## $ Solidity        <dbl> 0.9888560, 0.9849856, 0.9895588, 0.9766957, 0.9908932,…
## $ roundness       <dbl> 0.9580271, 0.8870336, 0.9478495, 0.9039364, 0.9848771,…
## $ Compactness     <dbl> 0.9133578, 0.9538608, 0.9087742, 0.9283288, 0.9705155,…
## $ ShapeFactor1    <dbl> 0.007331506, 0.006978659, 0.007243912, 0.007016729, 0.…
## $ ShapeFactor2    <dbl> 0.003147289, 0.003563624, 0.003047733, 0.003214562, 0.…
## $ ShapeFactor3    <dbl> 0.8342224, 0.9098505, 0.8258706, 0.8617944, 0.9419004,…
## $ ShapeFactor4    <dbl> 0.9987239, 0.9984303, 0.9990661, 0.9941988, 0.9991661,…
head(unlabeled)
## # A tibble: 6 × 16
##    Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
##   <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
## 1 28395      610.            208.            174.         1.20        0.550
## 2 28734      638.            201.            183.         1.10        0.412
## 3 29380      624.            213.            176.         1.21        0.563
## 4 30008      646.            211.            183.         1.15        0.499
## 5 30140      620.            202.            190.         1.06        0.334
## 6 30279      635.            213.            182.         1.17        0.520
## # ℹ 10 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## #   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## #   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>
cat("NAs in unlabeled:", sum(is.na(unlabeled)), "\n")
## NAs in unlabeled: 0

3. Z-score Normalisation

unlabeled_scaled <- select(unlabeled, where(is.numeric)) |>
  scale() |> as_tibble()

4. K‑means from Scratch

delta_to_centroid <- function(X, centroid) {
  n <- nrow(X)
  p <- ncol(X)
  rowSums((X - matrix(centroid, n, p, byrow = TRUE))^2)
  }

delta_matrix <- function(X, centroids) {
  k <- nrow(centroids)
  sapply(seq_len(k),function(i) delta_to_centroid(X, centroids[i, ]))
  }

f_kmeans <- function(data, k, max_iter = 100, start = 10) {
  set.seed(82171165)
  X <- as.matrix(data)
  n <- nrow(X)
  p <- ncol(X)

  centroids    <- X[sample(n), , drop = FALSE][1:k, ] #  k random centroids
  clusters     <- integer(n)
  iter_reached <- NA

  for (iter in seq_len(max_iter)) {
    distances     <- delta_matrix(X, centroids)
    clusters_new  <- max.col(-distances) #idx of first largest
    has_converged <- !anyNA(clusters_new) && all(clusters_new == clusters)
  # cat(iter, "of", max_iter, "\n"); flush.console()

    if (has_converged) {
      iter_reached <- iter
      cat("converge @ iter_reached:", iter_reached, "\n")
      break
    }

    clusters <- clusters_new

    # recompute centroids
    for (j in seq_len(k)) {
      idx <- which(clusters == j)
      if (length(idx) > 0) {centroids[j, ] <- colMeans(X[idx, , drop = FALSE])
        }
      }
    }

  list(clusters  = clusters,      # return values
       centroids = centroids,
       iter      = iter_reached
      )
   }

compute_wss <- function(k, X, max_iter, start) {
  res <- tryCatch(
    f_kmeans(X, k, max_iter, start),
    error = function(e) NULL
  )
  if (is.null(res) || is.null(res$tot.withinss)) {
    return(NA_real_)  # must return a length-1 numeric
  }
  return(as.numeric(res$tot.withinss))  # force numeric(1)
}


k_vals <- 2:10
unlabeled_use <- unlabeled_scaled
if (!exists("max_iter")) max_iter <- 100
if (!exists("start")) start <- 10
wss_values <- sapply( k_vals, function(k) compute_wss(k, unlabeled_use, max_iter, start))
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 38 
## converge @ iter_reached: 25 
## converge @ iter_reached: 22 
## converge @ iter_reached: 25 
## converge @ iter_reached: 23 
## converge @ iter_reached: 18 
## converge @ iter_reached: 57
testing <- FALSE
if (testing) {
  message("🟡 Quick test: 500 rows, k=2:4, iter.max=5")
  unlabeled_use <- unlabeled_scaled[1:500,]; k_vals <- 2:4; max_iter <- 5
} else {
  message("🟢 Full run: all rows, k=2:10, iter.max=100")
  unlabeled_use <- unlabeled_scaled; k_vals <- 2:10; max_iter <- 100
}
k_vals <- 2:10
unlabeled_use <- unlabeled_scaled
wss_values <- vapply(k_vals, function(k) compute_wss(k, unlabeled_use, max_iter, start), numeric(1))
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 38 
## converge @ iter_reached: 25 
## converge @ iter_reached: 22 
## converge @ iter_reached: 25 
## converge @ iter_reached: 23 
## converge @ iter_reached: 18 
## converge @ iter_reached: 57
unlabeled_use <- unlabeled_scaled
if (exists("wss_values") && any(is.finite(wss_values))) {
plot(k_vals, wss_values, type="b", pch=19,
     xlab = "k",
     ylab = expression(paste("Within-cluster ", sum((x[i] - c[k])^2))),
     main = if (testing) "Elbow (testing)" else "Elbow (full)",
     xaxt = "n")
##     xlab="Number of clusters (k)", 
#     ylab=expression("Within-cluster " * sum((x[i]-c[k])^2)),
#     main=if(testing) "Elbow (testing)" else "Elbow (full)", xaxt="n")
axis(1, at = k_vals)
#                       side=1,line=3,cex=0.8)


}
debug_tbl <- purrr::map_dfr(k_vals, function(k) {
  res <- f_kmeans(unlabeled_scaled,k,max_iter)
  tot_within <- compute_wss(k,unlabeled_use,max_iter)
  grand_mean <- colMeans(unlabeled_use)
  totss <- sum(rowSums((as.matrix(unlabeled_use) -
           matrix(grand_mean,nrow(unlabeled_use),ncol(unlabeled_use),byrow=TRUE))^2))
  betweenss <- totss - tot_within
  tibble(k, tot_within, totss, betweenss,
         identity_ok=abs(totss-(betweenss+tot_within))<1e-8)
})
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 38 
## converge @ iter_reached: 38 
## converge @ iter_reached: 25 
## converge @ iter_reached: 25 
## converge @ iter_reached: 22 
## converge @ iter_reached: 22 
## converge @ iter_reached: 25 
## converge @ iter_reached: 25 
## converge @ iter_reached: 23 
## converge @ iter_reached: 23 
## converge @ iter_reached: 18 
## converge @ iter_reached: 18 
## converge @ iter_reached: 57 
## converge @ iter_reached: 57
print(debug_tbl)
## # A tibble: 9 × 5
##       k tot_within  totss betweenss identity_ok
##   <int>      <dbl>  <dbl>     <dbl> <lgl>      
## 1     2         NA 216160        NA NA         
## 2     3         NA 216160        NA NA         
## 3     4         NA 216160        NA NA         
## 4     5         NA 216160        NA NA         
## 5     6         NA 216160        NA NA         
## 6     7         NA 216160        NA NA         
## 7     8         NA 216160        NA NA         
## 8     9         NA 216160        NA NA         
## 9    10         NA 216160        NA NA
unlabeled_use <- unlabeled_scaled
k_vals <- 2:10
max_iter <- 100
start <- 10
wss_values <- vapply(
  k_vals,
  function(k) compute_wss(k, unlabeled_use, max_iter, start),
  numeric(1)
)
## converge @ iter_reached: 8 
## converge @ iter_reached: 8 
## converge @ iter_reached: 38 
## converge @ iter_reached: 25 
## converge @ iter_reached: 22 
## converge @ iter_reached: 25 
## converge @ iter_reached: 23 
## converge @ iter_reached: 18 
## converge @ iter_reached: 57
if (all(is.na(wss_values)) || all(!is.finite(wss_values))) {
  warning("All clustering attempts failed. No elbow plot can be drawn.")
} else {
if (all(is.na(wss_values)) || all(!is.finite(wss_values))) {
  warning("All clustering attempts failed. No elbow plot can be drawn.")
} else {
if (exists("wss_values") && any(is.finite(wss_values))) {
plot(
  k_vals, wss_values, type="b", pch=19,
  xlab = "k",
  ylab = expression(paste("Within-cluster ", sum((x[i] - c[k])^2))),
  main = if (testing) "Elbow (testing)" else "Elbow (full)",
  xaxt = "n"
)
axis(1, at = k_vals)
}
}
}

5. Final Clustering

# Final clustering with user-selected k
k_best <- 7
res_final <- f_kmeans(
  unlabeled_scaled,
  k_best,
  max_iter,
  start
)
## converge @ iter_reached: 25
# Attach cluster assignment to unlabeled_use
unlabeled_use$cluster <- res_final$clusters

# Optional: also compute a label mapping using majority vote (saved separately)
if (exists("annotated")) {
  label_col <- "Class"
  annotated_subset <- annotated |> filter(!is.na(.data[[label_col]]))

  # Assign clusters to annotated points (assumes same row indexing)
  annotated_subset$cluster <- res_final$clusters[as.integer(rownames(annotated_subset))]

  # Determine the most common class in each cluster
  mapping <- annotated_subset |>
    count(.data[[label_col]], cluster, name = "n") |>
    group_by(cluster) |>
    slice_max(n, n = 1, with_ties = FALSE) |>
    ungroup()

  cluster_map_vote <- setNames(as.character(mapping[[label_col]]), mapping$cluster)

  # Assign vote-based labels separately
  unlabeled_use$label_vote <- cluster_map_vote[as.character(unlabeled_use$cluster)]
}

# Print diagnostic output
res_final$iter
## [1] 25
table(unlabeled_use$cluster)
## 
##    1    2    3    4    5    6    7 
## 2011  515 3137 1763 2461 3089  535
# Use centroids of clusters and annotated class groups to align labels by proximity

# Get centroids of clusters (in scaled unlabeled data)
cluster_centroids <- unlabeled_scaled %>%
  as.data.frame() %>%
  mutate(cluster = res_final$clusters) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean), .groups = "drop")

# Get centroids of known classes (in scaled annotated data)
annotated_scaled <- annotated %>%
  mutate(Class = as.factor(Class)) %>%
  select(-Class) %>%
  scale() %>%
  as.data.frame()
annotated_centroids <- annotated_scaled %>%
  mutate(Class = annotated$Class) %>%
  group_by(Class) %>%
  summarise(across(everything(), mean), .groups = "drop")

# Compute distances between each cluster centroid and class centroid
cluster_matrix <- as.matrix(select(cluster_centroids, -cluster))
class_matrix <- as.matrix(select(annotated_centroids, -Class))
distance_matrix <- as.matrix(dist(rbind(cluster_matrix, class_matrix)))

# Extract top-left block: distances between clusters and classes
k <- nrow(cluster_matrix)
m <- nrow(class_matrix)
prox_matrix <- distance_matrix[1:k, (k + 1):(k + m)]

# Assign each cluster to closest class
closest_class_index <- apply(prox_matrix, 1, which.min)
closest_class_labels <- annotated_centroids$Class[closest_class_index]

# Map cluster numbers to class labels
cluster_to_label <- setNames(as.character(closest_class_labels), cluster_centroids$cluster)
unlabeled_use$label <- cluster_to_label[as.character(unlabeled_use$cluster)]
pca_result <- PCA(unlabeled_scaled, graph = FALSE)
pca_coords <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords) <- c("PC1", "PC2", "PC3")

# Use class labels instead of cluster numbers
pca_coords$Class <- unlabeled_use$label

# Color palette
pal <- brewer.pal(max(3, length(unique(pca_coords$Class))), "Set2")

# Plot
plot_ly(data = pca_coords,
        x = ~PC1, y = ~PC2, z = ~PC3,
        color = ~Class, colors = pal,
        type = "scatter3d", mode = "markers",
        marker = list(size = 3)) |>
  layout(title = "PCA – Colored by Assigned Class Labels",
         legend = list(title = list(text = "Class")),
         scene = list(
           xaxis = list(title = "PC1"),
           yaxis = list(title = "PC2"),
           zaxis = list(title = "PC3")
         ))
if (!exists("res.pca")||!exists("res.hcpc")) {
  hcpc_data<-scale(unlabeled)
  res.pca<-PCA(hcpc_data,graph=FALSE)
  res.hcpc<-HCPC(res.pca,graph=FALSE)
}
pca_coords<-as.data.frame(res.pca$ind$coord[,1:3]);
colnames(pca_coords)<-c("PC1","PC2","PC3");
pca_coords$cluster <- factor(unlabeled_use$cluster)
pal<-brewer.pal(max(3,length(levels(pca_coords$cluster))),"Set2")
if (knitr::is_html_output()) {
  plot_ly(data=pca_coords,x=~PC1,y=~PC2,z=~PC3,type="scatter3d",mode="markers",
          color=~cluster,colors=pal,marker=list(size=4))|>
    layout(scene=list(xaxis=list(title="PC1"),yaxis=list(title="PC2"),zaxis=list(title="PC3")),
           legend=list(title=list(text="Cluster")))
} else {
  fviz_cluster(res.hcpc,geom="point",repel=TRUE,show.clust.cent=TRUE,axes=c(1,2),palette=pal)
}
pca_coords2 <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords2) <- c("PC1", "PC2", "PC3")
pca_coords2$label <- unlabeled_use$label_proximity

pal2 <- brewer.pal(max(3, length(unique(pca_coords2$label))), "Set2")

plot_ly(data = pca_coords2, 
        x = ~PC1, y = ~PC2, z = ~PC3,
        color = ~label, colors = pal2,
        type = "scatter3d", mode = "markers",
        marker = list(size = 3)) |>
  layout(title = "PCA - Colored by Class Label (Centroid Proximity)",
         legend = list(title = list(text = "Class")),
         scene = list(
           xaxis = list(title = "PC1"),
           yaxis = list(title = "PC2"),
           zaxis = list(title = "PC3")
         ))

6. Classification of Annotated Data

# Ensure Class is a factor
annotated$Class <- as.factor(annotated$Class)

# Remove class temporarily, scale, then add back
ann <- select(annotated, -Class) |> scale() |> as_tibble()
ann$Class <- annotated$Class

# Create training/test split
set.seed(82171165)
idx <- createDataPartition(ann$Class, p = 0.7, list = FALSE)
train <- ann[idx, ]
test  <- ann[-idx, ]

# Train classifier
model <- train(Class ~ ., data = train, method = "svmRadial")

# Predict and evaluate
pred <- predict(model, test)
confusionMatrix(pred, test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 1 0 0 0 0 0 0
##          B 0 1 0 0 0 0 0
##          C 0 0 4 0 1 0 0
##          D 0 0 0 6 0 0 0
##          E 0 0 0 0 1 0 0
##          F 0 0 0 0 0 6 0
##          G 1 0 0 0 0 0 6
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9259          
##                  95% CI : (0.7571, 0.9909)
##     No Information Rate : 0.2222          
##     P-Value [Acc > NIR] : 1.014e-14       
##                                           
##                   Kappa : 0.9085          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity           0.50000  1.00000   1.0000   1.0000  0.50000   1.0000
## Specificity           1.00000  1.00000   0.9565   1.0000  1.00000   1.0000
## Pos Pred Value        1.00000  1.00000   0.8000   1.0000  1.00000   1.0000
## Neg Pred Value        0.96154  1.00000   1.0000   1.0000  0.96154   1.0000
## Prevalence            0.07407  0.03704   0.1481   0.2222  0.07407   0.2222
## Detection Rate        0.03704  0.03704   0.1481   0.2222  0.03704   0.2222
## Detection Prevalence  0.03704  0.03704   0.1852   0.2222  0.03704   0.2222
## Balanced Accuracy     0.75000  1.00000   0.9783   1.0000  0.75000   1.0000
##                      Class: G
## Sensitivity            1.0000
## Specificity            0.9524
## Pos Pred Value         0.8571
## Neg Pred Value         1.0000
## Prevalence             0.2222
## Detection Rate         0.2222
## Detection Prevalence   0.2593
## Balanced Accuracy      0.9762
pca_result <- PCA(unlabeled_scaled, graph = FALSE)
pca_coords <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords) <- c("PC1", "PC2", "PC3")

# Use Class labels derived from cluster mapping
pca_coords$Class <- unlabeled_use$label

# Color palette
pal <- brewer.pal(max(3, length(unique(pca_coords$Class))), "Set2")

# Plot
plot_ly(data = pca_coords,
        x = ~PC1, y = ~PC2, z = ~PC3,
        color = ~Class, colors = pal,
        type = "scatter3d", mode = "markers",
        marker = list(size = 3)) |>
  layout(title = "PCA – Colored by Assigned Class Labels",
         legend = list(title = list(text = "Class")),
         scene = list(
           xaxis = list(title = "PC1"),
           yaxis = list(title = "PC2"),
           zaxis = list(title = "PC3")
         ))

7. HCPC 3D Factor Map

if (!exists("res.pca")||!exists("res.hcpc")) {
  hcpc_data<-scale(unlabeled)
  res.pca<-PCA(hcpc_data,graph=FALSE)
  res.hcpc<-HCPC(res.pca,graph=FALSE)
}
pca_coords<-as.data.frame(res.pca$ind$coord[,1:3]);
colnames(pca_coords)<-c("PC1","PC2","PC3");
pca_coords$cluster <- factor(unlabeled_use$cluster)
pal<-brewer.pal(max(3,length(levels(pca_coords$cluster))),"Set2")
if (knitr::is_html_output()) {
  plot_ly(data=pca_coords,x=~PC1,y=~PC2,z=~PC3,type="scatter3d",mode="markers",
          color=~cluster,colors=pal,marker=list(size=4))|>
    layout(scene=list(xaxis=list(title="PC1"),yaxis=list(title="PC2"),zaxis=list(title="PC3")),
           legend=list(title=list(text="Cluster")))
} else {
  fviz_cluster(res.hcpc,geom="point",repel=TRUE,show.clust.cent=TRUE,axes=c(1,2),palette=pal)
}
# Combine cluster info with PCA coordinates
pca_clustered <- pca_coords |>
  mutate(cluster = unlabeled_use$cluster)

# Select one cluster to highlight
target_cluster <- 3
highlight <- filter(pca_clustered, cluster == target_cluster)

# Plot just the selected cluster in 2D
ggplot(highlight, aes(x = PC1, y = PC2)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  labs(
    title = glue("Cluster {target_cluster}: PCA Projection"),
    x = "PC1", y = "PC2"
  ) +
  theme_minimal()

# Set which cluster to highlight
target_cluster <- 3

# Compute PCA coordinates if not already done
if (!exists("pca_coords")) {
  pca_coords <- as.data.frame(res.pca$ind$coord[, 1:2])
  colnames(pca_coords) <- c("PC1", "PC2")
}

# Add cluster assignment
pca_coords$cluster <- factor(res_final$clusters)

# Filter only points from the target cluster
highlight <- pca_coords |> filter(cluster == target_cluster)

# Plot only this cluster
ggplot(highlight, aes(x = PC1, y = PC2)) +
  geom_point(color = "steelblue", size = 3, alpha = 0.6) +
  labs(
    title = glue("Points from Cluster {target_cluster}"),
    x = "PC1", y = "PC2"
  ) +
  theme_minimal()